### Import all necessary Libraries
import numpy as np
import pandas as pd
import seaborn as sns
from plotnine import *
import matplotlib.pyplot as plt
sample = pd.read_csv("SampleSuperstore.csv")
sample.head()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 |
sample.shape
(9994, 13)
sample.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9994 entries, 0 to 9993 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Ship Mode 9994 non-null object 1 Segment 9994 non-null object 2 Country 9994 non-null object 3 City 9994 non-null object 4 State 9994 non-null object 5 Postal Code 9994 non-null int64 6 Region 9994 non-null object 7 Category 9994 non-null object 8 Sub-Category 9994 non-null object 9 Sales 9994 non-null float64 10 Quantity 9994 non-null int64 11 Discount 9994 non-null float64 12 Profit 9994 non-null float64 dtypes: float64(3), int64(2), object(8) memory usage: 1015.1+ KB
sample.describe()
| Postal Code | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|
| count | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 | 9994.000000 |
| mean | 55190.379428 | 229.858001 | 3.789574 | 0.156203 | 28.656896 |
| std | 32063.693350 | 623.245101 | 2.225110 | 0.206452 | 234.260108 |
| min | 1040.000000 | 0.444000 | 1.000000 | 0.000000 | -6599.978000 |
| 25% | 23223.000000 | 17.280000 | 2.000000 | 0.000000 | 1.728750 |
| 50% | 56430.500000 | 54.490000 | 3.000000 | 0.200000 | 8.666500 |
| 75% | 90008.000000 | 209.940000 | 5.000000 | 0.200000 | 29.364000 |
| max | 99301.000000 | 22638.480000 | 14.000000 | 0.800000 | 8399.976000 |
#Checking Missing Values
sample.isnull().sum()
Ship Mode 0 Segment 0 Country 0 City 0 State 0 Postal Code 0 Region 0 Category 0 Sub-Category 0 Sales 0 Quantity 0 Discount 0 Profit 0 dtype: int64
# Chaecking the dupilication in data
sample.duplicated().sum()
17
sample.drop_duplicates()
| Ship Mode | Segment | Country | City | State | Postal Code | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | 42420 | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | 90036 | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | 33311 | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9989 | Second Class | Consumer | United States | Miami | Florida | 33180 | South | Furniture | Furnishings | 25.2480 | 3 | 0.20 | 4.1028 |
| 9990 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Furniture | Furnishings | 91.9600 | 2 | 0.00 | 15.6332 |
| 9991 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Technology | Phones | 258.5760 | 2 | 0.20 | 19.3932 |
| 9992 | Standard Class | Consumer | United States | Costa Mesa | California | 92627 | West | Office Supplies | Paper | 29.6000 | 4 | 0.00 | 13.3200 |
| 9993 | Second Class | Consumer | United States | Westminster | California | 92683 | West | Office Supplies | Appliances | 243.1600 | 2 | 0.00 | 72.9480 |
9977 rows × 13 columns
sample.nunique()
Ship Mode 4 Segment 3 Country 1 City 531 State 49 Postal Code 631 Region 4 Category 3 Sub-Category 17 Sales 5825 Quantity 14 Discount 12 Profit 7287 dtype: int64
#Deleting the Variable.
col=['Postal Code']
sample1=sample.drop(columns=col,axis=1)
#Correlation Between Variables.
sample1.corr()
| Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|
| Sales | 1.000000 | 0.200795 | -0.028190 | 0.479064 |
| Quantity | 0.200795 | 1.000000 | 0.008623 | 0.066253 |
| Discount | -0.028190 | 0.008623 | 1.000000 | -0.219487 |
| Profit | 0.479064 | 0.066253 | -0.219487 | 1.000000 |
# Covariance of columns
sample1.cov()
| Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|
| Sales | 388434.455308 | 278.459923 | -3.627228 | 69944.096586 |
| Quantity | 278.459923 | 4.951113 | 0.003961 | 34.534769 |
| Discount | -3.627228 | 0.003961 | 0.042622 | -10.615173 |
| Profit | 69944.096586 | 34.534769 | -10.615173 | 54877.798055 |
sample1.head()
| Ship Mode | Segment | Country | City | State | Region | Category | Sub-Category | Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Second Class | Consumer | United States | Henderson | Kentucky | South | Furniture | Bookcases | 261.9600 | 2 | 0.00 | 41.9136 |
| 1 | Second Class | Consumer | United States | Henderson | Kentucky | South | Furniture | Chairs | 731.9400 | 3 | 0.00 | 219.5820 |
| 2 | Second Class | Corporate | United States | Los Angeles | California | West | Office Supplies | Labels | 14.6200 | 2 | 0.00 | 6.8714 |
| 3 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | South | Furniture | Tables | 957.5775 | 5 | 0.45 | -383.0310 |
| 4 | Standard Class | Consumer | United States | Fort Lauderdale | Florida | South | Office Supplies | Storage | 22.3680 | 2 | 0.20 | 2.5164 |
plt.figure(figsize=(16,8))
plt.bar('Sub-Category','Category', data=sample1)
plt.title('Category vs Sub Category')
plt.xlabel('Sub-Catgory')
plt.ylabel('Category')
plt.xticks(rotation=45)
plt.show()
sample1.corr()
| Sales | Quantity | Discount | Profit | |
|---|---|---|---|---|
| Sales | 1.000000 | 0.200795 | -0.028190 | 0.479064 |
| Quantity | 0.200795 | 1.000000 | 0.008623 | 0.066253 |
| Discount | -0.028190 | 0.008623 | 1.000000 | -0.219487 |
| Profit | 0.479064 | 0.066253 | -0.219487 | 1.000000 |
sample1.hist(bins=50 ,figsize=(20,15))
plt.show();
# Count the total repeatable states
sample1['State'].value_counts()
California 2001 New York 1128 Texas 985 Pennsylvania 587 Washington 506 Illinois 492 Ohio 469 Florida 383 Michigan 255 North Carolina 249 Arizona 224 Virginia 224 Georgia 184 Tennessee 183 Colorado 182 Indiana 149 Kentucky 139 Massachusetts 135 New Jersey 130 Oregon 124 Wisconsin 110 Maryland 105 Delaware 96 Minnesota 89 Connecticut 82 Oklahoma 66 Missouri 66 Alabama 61 Arkansas 60 Rhode Island 56 Utah 53 Mississippi 53 Louisiana 42 South Carolina 42 Nevada 39 Nebraska 38 New Mexico 37 Iowa 30 New Hampshire 27 Kansas 24 Idaho 21 Montana 15 South Dakota 12 Vermont 11 District of Columbia 10 Maine 8 North Dakota 7 West Virginia 4 Wyoming 1 Name: State, dtype: int64
plt.figure(figsize=(15,15))
sns.countplot(x=sample1['State'])
plt.xticks(rotation=90)
plt.title("STATE")
plt.show()
Profit_plot = (ggplot(sample, aes(x='Sub-Category', y='Profit', fill='Sub-Category')) + geom_col() + coord_flip()
+ scale_fill_brewer(type='div', palette="Spectral") + theme_classic() + ggtitle('Pie Chart'))
display(Profit_plot)
C:\Users\ANAS MUSTAQEEM\AppData\Local\Programs\Python\Python39\lib\site-packages\mizani\palettes.py:397: UserWarning: Warning message:Brewer palette Spectral has a maximum of 11 colors Returning the palette you asked for with that many colors C:\Users\ANAS MUSTAQEEM\AppData\Local\Programs\Python\Python39\lib\site-packages\mizani\palettes.py:397: UserWarning: Warning message:Brewer palette Spectral has a maximum of 11 colors Returning the palette you asked for with that many colors
<ggplot: (147365171950)>
Above Pie chart Shows the profit and loss of each and every subcategories.Here from graph we can visualize that "binders" sub-category has suffered the highest amount of loss and also profit amongst all other sub-Categories (For now we can't say that what is the reason it may be because of discounts given on binders subcategory)
sns.set(style="whitegrid")
plt.figure(2, figsize=(20,15))
sns.barplot(x='Sub-Category',y='Profit', data=sample, palette='Spectral')
plt.suptitle('Pie Consumption Patterns in the United States', fontsize=16)
plt.show()
ggplot(sample, aes(x='Ship Mode', fill = 'Category')) + geom_bar(stat = 'count')
<ggplot: (147365334324)>
figsize=(15,10)
sns.pairplot(sample1,hue='Sub-Category')
plt.show
<function matplotlib.pyplot.show(close=None, block=None)>
flip_xlabels = theme(axis_text_x = element_text(angle=90, hjust=1),figure_size=(10,5),
axis_ticks_length_major=10,axis_ticks_length_minor=5)
(ggplot(sample, aes(x='Sub-Category', fill='Sales')) + geom_bar() + facet_wrap(['Segment'])
+ flip_xlabels +theme(axis_text_x = element_text(size=12))+ggtitle("Sales From Every Segment Of United States of Whole Data"))
<ggplot: (147236996337)>
From above Graph we can say that "Home Office" segment has less purchased Sub-Categories and in that "Tables","Supplies","Machines","Copiers","Bookcases" has the lowest Sales. "Consumer" has purchased more sub-categories as compared to other segments.
flip_xlabels = theme(axis_text_x = element_text(angle=90, hjust=1),figure_size=(10,5),
axis_ticks_length_major=10,axis_ticks_length_minor=5)
(ggplot(sample, aes(x='Sub-Category', fill='Discount')) + geom_bar() + facet_wrap(['Segment'])
+ flip_xlabels +theme(axis_text_x = element_text(size=12))+ggtitle("Discount on Categories From Every Segment Of United States of Whole Data"))
<ggplot: (147237110257)>
flip_xlabels = theme(axis_text_x = element_text(angle=90, hjust=10),figure_size=(10,10),
axis_ticks_length_major=50,axis_ticks_length_minor=50)
(ggplot(sample1, aes(x='Category', fill='Sales')) + geom_bar() + theme(axis_text_x = element_text(size=10))
+ facet_wrap(['Region']) + flip_xlabels+ ggtitle("Sales From Every Region Of United States of Whole Data"))
<ggplot: (147365365607)>
plt.figure(figsize=(10,4))
sns.lineplot('Discount','Profit', data=sample1 , color='y',label='Discount')
plt.legend()
plt.show()
C:\Users\ANAS MUSTAQEEM\AppData\Local\Programs\Python\Python39\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
state_code = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','District of Columbia': 'WA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}
sample1['state_code'] = sample1.State.apply(lambda x: state_code[x])
state_data = sample1[['Sales', 'Profit', 'state_code']].groupby(['state_code']).sum()
fig = go.Figure(data=go.Choropleth(
locations=state_data.index,
z = state_data.Sales,
locationmode = 'USA-states',
colorscale = 'Reds',
colorbar_title = 'Sales in USD',
))
fig.update_layout(
title_text = 'Total State-Wise Sales',
geo_scope='usa',
height=800,
)
fig.show()
Now, let us analyze the sales of a few random states from each profit bracket (high profit, medium profit, low profit, low loss and high loss) and try to observe some crucial trends which might help us in increasing the sales.
def state_data_viewer(states):
"""Plots the turnover generated by different product categories and sub-categories for the list of given states.
Args:
states- List of all the states you want the plots for
Returns:
None
"""
product_data = sample1.groupby(['State'])
for state in states:
data = product_data.get_group(state).groupby(['Category'])
fig, ax = plt.subplots(1, 3, figsize = (28,5))
fig.suptitle(state, fontsize=14)
ax_index = 0
for cat in ['Furniture', 'Office Supplies', 'Technology']:
cat_data = data.get_group(cat).groupby(['Sub-Category']).sum()
sns.barplot(x = cat_data.Profit, y = cat_data.index, ax = ax[ax_index])
ax[ax_index].set_ylabel(cat)
ax_index +=1
fig.show()
states = ['California', 'Washington', 'Mississippi', 'Arizona', 'Texas']
state_data_viewer(states)
C:\Users\ANASMU~1\AppData\Local\Temp/ipykernel_41108/1627344209.py:19: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. C:\Users\ANASMU~1\AppData\Local\Temp/ipykernel_41108/1627344209.py:19: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. C:\Users\ANASMU~1\AppData\Local\Temp/ipykernel_41108/1627344209.py:19: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. C:\Users\ANASMU~1\AppData\Local\Temp/ipykernel_41108/1627344209.py:19: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. C:\Users\ANASMU~1\AppData\Local\Temp/ipykernel_41108/1627344209.py:19: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
x = sample.iloc[:, [9, 10, 11, 12]].values
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters = i, init = 'k-means++',
max_iter = 300, n_init = 10, random_state = 0).fit(x)
wcss.append(kmeans.inertia_)
sns.set_style("whitegrid")
sns.FacetGrid(sample, hue ="Sub-Category",height = 6).map(plt.scatter,'Sales','Quantity')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1],
s = 100, c = 'yellow', label = 'Centroids')
plt.legend()
plt.show()
sns.set_style("whitegrid")
sns.FacetGrid(sample, hue ="Sub-Category",height = 6).map(plt.scatter,'Sales','Profit')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1],
s = 100, c = 'yellow', label = 'Centroids')
plt.legend()
plt.show()
fig, ax = plt.subplots(figsize = (10 , 6))
ax.scatter(sample1["Sales"] , sample1["Profit"])
ax.set_xlabel('Sales')
ax.set_ylabel('Profit')
ax.set_title('Sales vs Profit')
plt.show()
*One thing to be noted is that while the superstore is incurring losses due to giving discounts on its products, they can't stop giving discounts of their products. Most of the heavy discounts are during festivals, end-of-season and clearance sales which are necessary so that the store can make space in their warehouses for fresh stock. Also, by incurring small losses, the company gains in the future by attracting more long term customers. Therefore, the small losses from discounts are an essential part of company's business